Author: Javier Dolado. Collaborator: Daniel Rodriguez

This is the R Markdown document for the talk “Data Analysis in Software Engineering”, given at the University of Alcalá on Tuesday, 9th of February, 2016. This document requires R and RStudio to be installed. Knitting this document executes all chunks of R code and generates an HTML document.

The purpose of the talk is to give a hands-on experience of a data analysis in software engineering. We use basic statistical procedures for analysing some public software engineering datasets.

R Markdown

This is an R Markdown presentation. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.

Important: to compile this document you need to change the paths to the folders where you have downloaded the content of https://github.com/javierdolado/————–, i.e., change */home/javier/DocProjects/PRESI2013/————-* accordingly.

Outline of the talk


Setting up the programming environment

#  install.packages("rmarkdown")  
#  install.packages("knitr")
setwd("/home/javier/DocProjects/PRESI2013/icebergFeb2016")

Uncommenting the install.packages() lines above will install the rmarkdown and knitr packages on your R system.

The Aim of Data Analysis and Statistical Learning

Getting the Data

Cleaning the Data

Descriptive statistics

Normality

main.title <- "Area within 2 SD of the mean"
par(mfrow=c(1,2))
plot(function(x) dnorm(x, mean = 0, sd = 1),
xlim = c(-3, 3), main = "SD 1", xlab = "x",
ylab = "", cex = 2)
segments(-2, 0, -2, 0.4)
segments(2, 0, 2, 0.4)
plot(function(x) dnorm(x, mean = 0, sd = 4),
xlim = c(-12, 12), main = "SD 4", xlab = "x",
ylab = "", cex = 2)
segments(-8, 0, -8, 0.1)
segments(8, 0, 8, 0.1)

sample.means <- rep(NA, 1000)
for (i in 1:1000) {
  sample.40 <- rnorm(40, mean = 60, sd = 4)
  sample.means[i] <- mean(sample.40)
}
means40 <- mean(sample.means)
sd40 <- sd(sample.means)
means40
## [1] 59.99446
sd40
## [1] 0.6344591
hist(sample.means)

Getting the Data. Descriptive statistics.

options(digits=3)
path2files <- "~/DocProjects/PRESI2013/icebergFeb2016" # Set here the path to the folder where you have downloaded the files
setwd(path2files) #this command sets the default directory
telecom1 <- read.table("Telecom1.csv", sep=",",header=TRUE, stringsAsFactors=FALSE, dec = ".") #read data
summary(telecom1)
##       size           effort        EstTotal  
##  Min.   :  3.0   Min.   :  24   Min.   : 30  
##  1st Qu.: 37.2   1st Qu.: 119   1st Qu.:142  
##  Median : 68.5   Median : 222   Median :289  
##  Mean   :100.3   Mean   : 284   Mean   :320  
##  3rd Qu.:164.0   3rd Qu.: 352   3rd Qu.:472  
##  Max.   :284.0   Max.   :1116   Max.   :777
par(mfrow=c(1,2)) #two figures per row
size_telecom1 <- telecom1$size
effort_telecom1 <- telecom1$effort
hist(size_telecom1, col="blue")
hist(effort_telecom1, col="blue")

boxplot(size_telecom1)
boxplot(effort_telecom1)

par(mfrow=c(1,1))
qqnorm(size_telecom1, main="Q-Q Plot of 'size'")
qqline(size_telecom1, col=2, lwd=2, lty=2) #draws a line through the first and third quartiles

qqnorm(effort_telecom1,  main="Q-Q Plot of 'effort'")
qqline(effort_telecom1)

- We observe the non-normality of the data.

plot(size_telecom1, effort_telecom1)

China dataset

library(foreign)
china <- read.arff("china.arff")
china_size <- china$AFP
summary(china_size)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       9     100     215     487     438   17500
china_effort <- china$Effort
summary(china_effort)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      26     704    1830    3920    3830   54600
par(mfrow=c(1,2))
hist(china_size, col="blue", xlab="Adjusted Function Points", main="Distribution of AFP")
hist(china_effort, col="blue",xlab="Effort", main="Distribution of Effort")

boxplot(china_size)
boxplot(china_effort)

qqnorm(china_size)
qqline(china_size)
qqnorm(china_effort)
qqline(china_effort)

- We observe the non-normality of the data.

Normality. Galton data

Normalization

China dataset. Correlation.

par(mfrow=c(1,1))
plot(china_size,china_effort)

cor(china_size,china_effort)
## [1] 0.685
cor.test(china_size,china_effort)
## 
##  Pearson's product-moment correlation
## 
## data:  china_size and china_effort
## t = 20, df = 500, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.635 0.729
## sample estimates:
##   cor 
## 0.685
cor(china_size,china_effort, method="spearman")
## [1] 0.649
cor(china_size,china_effort, method="kendall")
## [1] 0.468

Linear Regression

Galton Data

# library(UsingR); data(galton)
par(mfrow=c(1,2))
hist(galton$child,col="blue",breaks=100)
hist(galton$parent,col="blue",breaks=100)

plot(galton$parent,galton$child,pch=1,col="blue", cex=0.4)
lm1 <- lm(galton$child ~ galton$parent)
lines(galton$parent,lm1$fitted,col="red",lwd=3)
plot(galton$parent,lm1$residuals,col="blue",pch=1, cex=0.4)
abline(c(0,0),col="red",lwd=3)

qqnorm(galton$child)

Linear Regression Diagnostics

China dataset. Linear regression. Fitting a linear model to log-log

linmodel_logchina <- lm(logchina_effort ~ logchina_size)
par(mfrow=c(1,1))
plot(logchina_size, logchina_effort)
abline(linmodel_logchina, lwd=3, col=3)

par(mfrow=c(1,2))
plot(linmodel_logchina, ask = FALSE)

linmodel_logchina
## 
## Call:
## lm(formula = logchina_effort ~ logchina_size)
## 
## Coefficients:
##   (Intercept)  logchina_size  
##         3.301          0.768

Building and Validating a Model

China dataset. Split data into Training and Testing

chinaTrain <- read.arff("china3AttSelectedAFPTrain.arff")
nrow(chinaTrain)
## [1] 332
logchina_size <- log(chinaTrain$AFP)
logchina_effort <- log(chinaTrain$Effort)
linmodel_logchina_train <- lm(logchina_effort ~ logchina_size)
par(mfrow=c(1,1))
plot(logchina_size, logchina_effort)
abline(linmodel_logchina_train, lwd=3, col=4)

par(mfrow=c(1,2))
plot(linmodel_logchina_train, ask = FALSE)

linmodel_logchina_train
## 
## Call:
## lm(formula = logchina_effort ~ logchina_size)
## 
## Coefficients:
##   (Intercept)  logchina_size  
##         3.249          0.784

Measures of Evaluation used in Software Engineering

Evaluation of the model in the Testing data (2)

gm_mean = function(x, na.rm=TRUE){
  exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))}

chinaTest <- read.arff("china3AttSelectedAFPTest.arff")
b0 <- linmodel_logchina_train$coefficients[1]
b1 <- linmodel_logchina_train$coefficients[2]
china_size_test <- chinaTest$AFP
actualEffort <- chinaTest$Effort
predEffort <- exp(b0+b1*log(china_size_test))

err <- actualEffort - predEffort  #error or residual
ae <- abs(err)
hist(ae, main="Absolute Error in the China Test data")

mar <- mean(ae)
mre <- ae/actualEffort
mmre <- mean(mre)
mdmre <- median(mre)
gmar <- gm_mean(ae)
mar
## [1] 1867
mmre
## [1] 1.15
mdmre
## [1] 0.551
gmar
## [1] 833
level_pred <- 0.25 #below and above (both)
lowpred <- actualEffort*(1-level_pred)
uppred <-  actualEffort*(1+level_pred)
pred  <-  predEffort <= uppred & predEffort >= lowpred  #pred is a vector with logical values 
Lpred <- sum(pred)/length(pred)
Lpred
## [1] 0.186

Building a Linear Model on the Telecom1 dataset

samplesize <- floor(0.66*nrow(telecom1))
set.seed(012) # to make the partition reproducible
train_idx <- sample(seq_len(nrow(telecom1)), size = samplesize)
telecom1_train <- telecom1[train_idx, ]
telecom1_test <- telecom1[-train_idx, ]

par(mfrow=c(1,1))
# transformation of variables to log-log
xtrain <- log(telecom1_train$size)
ytrain <- log(telecom1_train$effort)
lmtelecom1 <- lm( ytrain ~ xtrain)
plot(xtrain, ytrain)
abline(lmtelecom1, lwd=2, col="blue")

b0_tel1 <- lmtelecom1$coefficients[1]
b1_tel1 <- lmtelecom1$coefficients[2]
# calculate residuals and predicted values
res <- signif(residuals(lmtelecom1), 5)

xtest <- telecom1_test$size
ytest <- telecom1_test$effort
pre_tel1 <- exp(b0+b1*log(xtest))
# plot distances between points and the regression line
plot(xtest, ytest)
curve(exp(b0+b1*log(x)), from=0, to=300, add=TRUE, col="blue", lwd=2)
segments(xtest, ytest, xtest, pre_tel1, col="red")

Building a Linear Model on the Telecom1 dataset with all observations

par(mfrow=c(1,1))
lmtelecom <- lm(effort_telecom1 ~ size_telecom1)
plot(size_telecom1, effort_telecom1)
abline(lmtelecom, lwd=3, col="blue")
# calculate residuals and predicted values
res <- signif(residuals(lmtelecom), 5) 
predicted <- predict(lmtelecom)
# plot distances between points and the regression line
segments(size_telecom1, effort_telecom1, size_telecom1, predicted, col="red")

level_pred <- 0.25 #below and above (both)
lowpred <- effort_telecom1*(1-level_pred)
uppred <-  effort_telecom1*(1+level_pred)
predict_inrange  <-  predicted <= uppred & predicted >= lowpred  #pred is a vector with logical values 
Lpred <- sum(predict_inrange)/length(predict_inrange)
Lpred
## [1] 0.444
#Visually plot lpred
segments(size_telecom1, lowpred, size_telecom1, uppred, col="red", lwd=3)

err_telecom1 <- abs(effort_telecom1 - predicted)
mar_tel1 <- mean(err_telecom1)
mar_tel1
## [1] 125

Standardised Accuracy. MARP0. ChinaTest

estimEffChinaTest <- predEffort  # This will be overwritten, no problem
numruns <- 9999
randguessruns <- rep(0, numruns)
for (i in 1:numruns) { 
  for (j in 1:length(estimEffChinaTest)) {
    estimEffChinaTest[j] <- sample(actualEffort[-j],1)}#replacement with random guessingt    
  randguessruns[i] <- mean(abs(estimEffChinaTest-actualEffort))
  } 
marp0Chinatest <- mean(randguessruns)
marp0Chinatest
## [1] 3956
hist(randguessruns, main="MARP0 distribution of the China dataset")

saChina = (1- mar/marp0Chinatest)*100
saChina
## [1] 52.8

Standardised Accuracy. MARP0. Telecom1

path2files <- "~/DocProjects/PRESI2013/icebergFeb2016"
setwd(path2files)
telecom1 <- read.table("Telecom1.csv", sep=",",header=TRUE, stringsAsFactors=FALSE, dec = ".") #read data
#par(mfrow=c(1,2))
#size <- telecom1[1]$size   not needed now
actualEffTelecom1 <- telecom1[2]$effort
estimEffTelecom1 <- telecom1[3]$EstTotal # this will be overwritten
numruns <- 9999
randguessruns <- rep(0, numruns)
for (i in 1:numruns) { 
  for (j in 1:length(estimEffTelecom1)) {
    estimEffTelecom1[j] <- sample(actualEffTelecom1[-j],1)}#replacement with random guessingt    
  randguessruns[i] <- mean(abs(estimEffTelecom1-actualEffTelecom1))
  } 
marp0telecom1 <- mean(randguessruns)
marp0telecom1
## [1] 271
hist(randguessruns, main="MARP0 distribution of the Telecom1 dataset")

saTelecom1 <- (1- mar_tel1/marp0telecom1)*100
saTelecom1
## [1] 54

MARP0 in the Atkinson dataset

  • For checking the results you may use the Atkinson figure in Shepperd & MacDonnell.
## [1] 281

ISBSG dataset

Confidence Intervals. Bootstrap

set.seed(10)
norsim(sims = 100, n = 36, mu = 100, sigma = 18, conf.level = 0.95)

Nonparametric Bootstrap

library(boot)
hist(ae, main="Absolute Errors of the China Test data")

level_confidence <- 0.95
repetitionsboot <- 9999
samplemean <- function(x, d){return(mean(x[d]))}
b_mean <- boot(ae, samplemean, R=repetitionsboot)
confint_mean_China <- boot.ci(b_mean)
## Warning in boot.ci(b_mean): bootstrap variances needed for studentized
## intervals
confint_mean_China
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 9999 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = b_mean)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   (1415, 2309 )   (1389, 2282 )  
## 
## Level     Percentile            BCa          
## 95%   (1451, 2345 )   (1488, 2411 )  
## Calculations and Intervals on Original Scale
boot_geom_mean <- function(error_vec){
  log_error <- log(error_vec[error_vec > 0])
  log_error <-log_error[is.finite(log_error)] #remove the -Inf value before calculating the mean, just in case
  samplemean <- function(x, d){return(mean(x[d]))}
  b <- boot(log_error, samplemean, R=repetitionsboot) # with package boot
  # this is a boot for the logs
  return(b)
}
# BCAconfidence interval for the geometric mean
BCAciboot4geommean <- function(b){  
  conf_int <- boot.ci(b, conf=level_confidence, type="bca")$bca #following 10.9 of Ugarte et al.'s book
  conf_int[5] <- exp(conf_int[5]) # the boot was computed with log. Now take the measure back to its previous units
  conf_int[4] <- exp(conf_int[4])
  return (conf_int)
}
# this is a boot object
b_gm <- boot_geom_mean(ae) #"ae" is the absolute error in the China Test data
print(paste0("Geometric Mean of the China Test data: ", round(exp(b_gm$t0), digits=3)))
## [1] "Geometric Mean of the China Test data: 832.55"
b_ci_gm <- BCAciboot4geommean(b_gm)
print(paste0("Confidence Interval: ", round(b_ci_gm[4], digits=3), " - ", round(b_ci_gm[5], digits=3)))
## [1] "Confidence Interval: 675.783 - 1012.201"
# Make a % confidence interval bca
# BCAciboot <- function(b){  
#   conf_int <- boot.ci(b, conf=level_confidence, type="bca")$bca #following 10.9 of Ugarte et al.'s book
#   return (conf_int)
# }

Genetic Programming for Symbolic Regression